Realtime audio analysis

Using the Web Audio API with ScriptProcessorNode.

One way to use essentia.js in real time is through the deprecated, but still widely supported, ScriptProcessorNode interface of the Web Audio API. The following walkthrough shows a basic setup.

  1. First, we define the global variables needed for audio, as well as for the UI (in this case, a simple button and a <div> to show results).
// global var to load essentia instance from wasm build
let essentia;
let isEssentiaInstance = false;
// global audio vars
let audioCtx;
let bufferSize = 1024; // buffer size for mic stream and ScriptProcessorNode
let mic = null;
let scriptNode = null;
let gumStream;

const AudioContext = window.AudioContext || window.webkitAudioContext;
audioCtx = new AudioContext();

let plotDiv = document.querySelector('#plotDiv'); // html div to print our results to
let recordButton = document.querySelector('#recordButton');
  2. Next, we define a function to be called by the ScriptProcessorNode every time its input buffer is ready to be processed (i.e. it will handle the AudioProcessingEvent). In it, we take the audio from the input channel, convert it to the VectorFloat type that Essentia can use, and feed it to the RMS algorithm (the analysis used in this example).
// ScriptNodeProcessor callback function to calculate RMS using essentia.js
function essentiaExtractorCallback(audioProcessingEvent) {
    // convert the float32 audio data into std::vector<float> for using with essentia algos
    var vectorSignal = essentia.arrayToVector( audioProcessingEvent.inputBuffer.getChannelData(0) );
    if (!vectorSignal) {
        throw "onRecordingError: empty audio signal input found!";
    }

    // check https://mtg.github.io/essentia.js/docs/api/Essentia.html#RMS
    let algoOutput = essentia.RMS(vectorSignal);
    // read the `rms` value from the algorithm's output object
    let rmsValue = algoOutput.rms;

    plotDiv.innerText = rmsValue;
}
  3. In order to use this in real time, we need to define a function that will request access to the microphone for live input. With the resulting stream, we can create the necessary Web Audio nodes (one of them using our essentiaExtractorCallback) and connect them to each other to form our audio processing graph.
function startMicRecordStream(btnCallback) {
    if (audioCtx.state === "suspended") audioCtx.resume(); 
    if (navigator.mediaDevices.getUserMedia) {
        console.log("Initializing audio...");
        navigator.mediaDevices.getUserMedia({ audio: true, video: false })
            .then((stream) => {
                gumStream = stream;
                if (gumStream.active) {
                    mic = audioCtx.createMediaStreamSource(stream);

                    scriptNode = audioCtx.createScriptProcessor(bufferSize, 1, 1);
                    // onprocess callback (where we perform our analysis with essentia.js)
                    scriptNode.onaudioprocess = essentiaExtractorCallback;
                    mic.connect(scriptNode);
                    scriptNode.connect(audioCtx.destination);

                    btnCallback(); // restore button state
                } else {
                    throw "Mic stream not active";
                }
            })
            .catch((message) => {
                throw "Could not access microphone - " + message;
            });

    } else {
    throw "Could not access microphone - getUserMedia not available";
    }
}
  4. We also define a function that will run when we click the "Stop" button, to stop the live microphone input (and make some minimal UI state changes). It is important to disconnect the audio nodes at this stage; otherwise, the previous mic and scriptNode nodes would remain connected, consume unnecessary memory and CPU, and cause our displayed results to glitch, since they would keep using the same AudioProcessingEvent callback.
function stopMicRecordStream() {
    audioCtx.suspend().then(() => {
        // stop mic stream
        gumStream.getAudioTracks().forEach(function(track) {
            track.stop();
        });
        recordButton.classList.remove("recording");
        recordButton.innerHTML = 'Mic   <i class="microphone icon"></i>';
        
        mic.disconnect();
        scriptNode.disconnect();
        plotDiv.innerText = "";
    });
}
  5. Lastly, we specify a click event handler for our Start/Stop button, which asynchronously loads the essentia.js backend, uses it to instantiate the core API, and triggers our previously defined startMicRecordStream to capture live audio from the user's microphone and start the analysis.
window.onload = () => {
    recordButton.onclick = function() {
        var recording = this.classList.contains("recording");
        if (!recording) {
            this.setAttribute("disabled", true);

            EssentiaWASM().then(function(essentiaModule) {
                if (!isEssentiaInstance) {
                    essentia = new Essentia(essentiaModule);
                    isEssentiaInstance = true;
                }
                startMicRecordStream(enableButton); // `enableButton` is just a function that re-enables the Start/Stop button (a sketch follows this snippet)
            });
        } else {
            stopMicRecordStream();
        }
    };
}
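The enableButton callback passed to startMicRecordStream is not part of the snippet above; it only needs to restore the button to its "recording" state once the stream is running. A minimal sketch (the class name and label are illustrative, mirroring the stop handler shown earlier):

// minimal sketch of the enableButton callback used above (label and class names are illustrative)
function enableButton() {
    recordButton.classList.add("recording");
    recordButton.innerHTML = 'Stop <i class="stop icon"></i>';
    recordButton.removeAttribute("disabled");
}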

For a more full-fledged example, take a look at our HPCP Chroma RT demo, which uses this technique.


Using the Web Audio API with AudioWorklets.

Another option for high-performance real-time audio on the web, which is more efficient and not deprecated, is the AudioWorklet interface. Our online demo for real-time RMS level detection uses this approach. You can check out the code here; the following snippets go over the main differences from using ScriptProcessorNode:

  1. Getting a live audio stream from the user's microphone is done as above (using navigator.mediaDevices.getUserMedia), as is setting up the audio graph and connecting the nodes.
  2. When setting up the audio graph, we have to create an AudioWorkletNode instead of a ScriptProcessorNode. This allows us to run our custom audio code on the audio rendering thread, separate from the UI thread. The following code returns one such node:
// main.js
const workletProcessorCode = "essentia-worklet-processor.js";

export async function createEssentiaNode (audioCtx) {
  try {
    await audioCtx.audioWorklet.addModule(workletProcessorCode); // add our custom code to the worklet scope and register our processor as `essentia-worklet-processor`
  } catch(e) {
    console.log(e);
  }
  return new AudioWorkletNode(audioCtx, 'essentia-worklet-processor'); // instantiate our custom processor as an AudioWorkletNode
}
  3. This is used in the audio graph setup function as follows (the rest of the graph is sketched after this snippet):
    micNode = audioContext.createMediaStreamSource(gumStream);
    // ...
    // create essentia node only once (avoid registering processor repeatedly)
    if (!essentiaNode) {
        essentiaNode = await createEssentiaNode(audioContext);
    }

    micNode.connect(essentiaNode);
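Depending on the browser, the worklet's process() callback may only run while the node is being pulled by the rendering graph, so it is common to connect it onward to the destination as well. A minimal sketch of one way to finish the graph (an assumed continuation, not necessarily how the demo does it), muting the output with a zero-gain node so the microphone is not played back:

    // assumed continuation: route the worklet's output to the destination through a muted
    // gain node, so the graph keeps rendering without playing the microphone back
    const silencer = audioContext.createGain();
    silencer.gain.setValueAtTime(0, audioContext.currentTime);
    essentiaNode.connect(silencer);
    silencer.connect(audioContext.destination);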
  4. Finally, this is our custom audio code, defined inside essentia-worklet-processor.js (the file we loaded in createEssentiaNode). It inherits from AudioWorkletProcessor; a note on getting results back to the main thread follows the snippet:
// essentia-worklet-processor.js
import { EssentiaWASM } from "https://cdn.jsdelivr.net/npm/essentia.js@<version>/dist/essentia-wasm.es.js";
import Essentia from "https://cdn.jsdelivr.net/npm/essentia.js@<version>/dist/essentia.js-core.es.js";

let essentia = new Essentia(EssentiaWASM);

class EssentiaWorkletProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    this.essentia = essentia;
    console.log('Backend - essentia: ' + this.essentia.version + ' - http://essentia.upf.edu');
  }
  
  // System-invoked process callback function.
  process(inputs, outputs, parameters) {

    // <inputs> and <outputs> contain as many inputs/outputs as were specified in the AudioWorkletNode constructor options; each may span multiple channels
    let input = inputs[0];
    let output = outputs[0];
    
    // convert the input audio frame array from channel 0 to a std::vector<float> type for using it in essentia
    let vectorInput = this.essentia.arrayToVector(input[0]);

    // In this case we compute the Root Mean Square of every input audio frame
    // check https://mtg.github.io/essentia.js/docs/api/Essentia.html#RMS 
    let rmsFrame = this.essentia.RMS(vectorInput); // compute RMS of the current input frame

    output[0][0] = rmsFrame.rms;

    return true; // keep the process running
  }
}

registerProcessor('essentia-worklet-processor', EssentiaWorkletProcessor); // must use the same name we gave our processor in `createEssentiaNode`
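Note that, since this code runs on the audio thread, it cannot update the DOM the way the ScriptProcessorNode callback did; here the RMS value is simply written into the node's output buffer. An alternative sketch (not what the snippet above does) is to post each value back to the main thread over the processor's MessagePort:

// alternative sketch: report values to the main thread over the MessagePort.
// Inside process(), instead of (or in addition to) writing to the output buffer:
this.port.postMessage(rmsFrame.rms);

// And on the main thread, after creating the node:
essentiaNode.port.onmessage = (event) => { plotDiv.innerText = event.data; };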

Cross-browser support

Since the fetch and XHR APIs are not available in the AudioWorklet global scope, and ES6 module imports inside AudioWorklets are only supported in Chrome, we need an alternative way of loading essentia.js inside the worklet. The async URLFromFiles() function used in ringbuf.js can be used to fetch and concatenate our custom code together with the essentia.js library on the main thread, where the AudioWorkletNode is created. This allows our custom processing to also work on Firefox and Edge.
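URLFromFiles() is not part of essentia.js or of the Web Audio API; it is a small helper taken from the ringbuf.js example code. Roughly, it fetches each file, concatenates the sources, and returns a Blob URL that audioWorklet.addModule() can load. A minimal sketch of such a helper (assuming this name and signature):

// minimal sketch of a URLFromFiles-style helper (after the ringbuf.js example code)
async function URLFromFiles(files) {
    const sources = await Promise.all(
        files.map((file) => fetch(file).then((response) => response.text()))
    );
    const blob = new Blob(sources, { type: "application/javascript" });
    return URL.createObjectURL(blob); // a URL that audioWorklet.addModule() can load
}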

// main.js
const workletProcessorCode = [
    "https://cdn.jsdelivr.net/npm/essentia.js@<version>/dist/essentia-wasm.umd.js",
    "https://cdn.jsdelivr.net/npm/essentia.js@<version>/dist/essentia.js-core.es.js",
    "essentia-worklet-processor.js"
];

export async function createEssentiaNode (audioCtx) {
  try {
    let concatenatedCode = await URLFromFiles(workletProcessorCode);
    await audioCtx.audioWorklet.addModule(concatenatedCode); // add our custom code to the worklet scope
  } catch(e) {
    console.log(e);
  }
  return new AudioWorkletNode(audioCtx, 'essentia-worklet-processor');
}

And inside our custom processor code, no import statements are needed anymore, since the library code is concatenated ahead of it; the instantiation at the top of the file simply becomes:

// essentia-worklet-processor.js
let essentia = new Essentia(EssentiaWASM);


You can find more real-time analysis examples in our online demos.